import os
from collections import Counter

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE


# Load the raw data
def loadArticle(fileName):
	'''
	Read the test articles from the raw homemade dataset.
	:param fileName: name of the file to read
	:return: list of cleaned article strings
	'''
	with open(fileName, encoding='utf-8') as file:
		# Read line by line
		test_article = []
		for line in file.readlines():
			# Strip the <content> tags, spaces (ASCII and full-width), and newlines
			line = line.replace("<content>", "")
			line = line.replace("</content>", "")
			line = line.replace(" ", "")
			line = line.replace("　", "")
			line = line.strip()
			if line:  # skip empty lines so they don't become empty documents
				test_article.append(line)
	return test_article


test_article = loadArticle("mytest.txt")

# Load the stop word list once, before the loop
stop_words = set()
with open('stopwords.txt', encoding='utf-8') as f:
	for line in f:
		stop_words.add(line.strip())

# Preprocessing: segment each article with jieba and drop stop words
texts = []
for text in test_article:
	text = ' '.join([word for word in jieba.cut(text) if word not in stop_words])
	texts.append(text)

# Text representation: bag-of-words counts over the 1000 most frequent terms
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(texts)
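
# Optional alternative: TF-IDF weighting often separates topical clusters
# better than raw counts. A minimal sketch, drop-in for the CountVectorizer:
# vectorizer = TfidfVectorizer(max_features=1000)
# X = vectorizer.fit_transform(texts)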

# Clustering
cluster = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=0)
cluster.fit(X)  # KMeans accepts the sparse matrix directly; densifying is unnecessary
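
# Note: n_clusters=5 is an assumption, not derived from the data. A common
# check is to compare silhouette scores across candidate k values (a sketch):
# for k in range(2, 10):
#     labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
#     print(k, silhouette_score(X, labels))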

# Reduce to 2-D with PCA for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())
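
# Optional sanity check: if the two components explain little of the variance,
# the 2-D scatter plot below may not reflect the true cluster structure.
print('PCA explained variance:', pca.explained_variance_ratio_.sum())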

# Scatter plot of the clusters in the PCA projection
plt.figure(figsize=(8, 8))
for i in range(cluster.n_clusters):
	plt.scatter(X_pca[cluster.labels_ == i, 0], X_pca[cluster.labels_ == i, 1], label='Cluster {}'.format(i))
plt.legend()
plt.show()

# Reduce to 2-D with t-SNE for a second view
tsne = TSNE(n_components=2, perplexity=30, n_iter=1000, random_state=0)
X_tsne = tsne.fit_transform(X.toarray())
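
# Note: t-SNE on 1000-dimensional sparse counts can be slow and noisy. A common
# trick (not applied here) is to reduce with PCA to ~50 dimensions first:
# X_tsne = tsne.fit_transform(PCA(n_components=50).fit_transform(X.toarray()))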

# Scatter plot of the clusters in the t-SNE projection
plt.figure(figsize=(8, 8))
for i in range(cluster.n_clusters):
	plt.scatter(X_tsne[cluster.labels_ == i, 0], X_tsne[cluster.labels_ == i, 1], label='Cluster {}'.format(i))
plt.legend()
plt.show()
# Print the cluster assignments
print('Cluster labels:', cluster.labels_)
# Save the texts of each cluster to its own txt file
os.makedirs('k_means', exist_ok=True)  # make sure the output directory exists
for i in range(cluster.n_clusters):
	with open('k_means/cluster_{}.txt'.format(i), 'w', encoding='utf-8') as f:
		for j in range(len(texts)):
			if cluster.labels_[j] == i:
				f.write('{}\n'.format(texts[j]))
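# Note: the saved lines are the segmented, stop-word-filtered texts; to keep
# the original articles instead, write test_article[j] in place of texts[j].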
# Summarize the top keywords for each cluster
for i in range(cluster.n_clusters):
	cluster_texts = []
	with open('k_means/cluster_{}.txt'.format(i), 'r', encoding='utf-8') as f:
		for line in f:
			cluster_texts.append(line.strip())
	cluster_words = []
	for text in cluster_texts:
		# The saved texts are already segmented, so splitting on whitespace suffices
		words = [word for word in text.split() if word not in stop_words and len(word) > 1]
		cluster_words.extend(words)
	word_count = Counter(cluster_words)
	top_words = [word for word, _ in word_count.most_common(10)]
	print('Cluster {}: {}'.format(i, ', '.join(top_words)))
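
# Alternative sketch (assumes scikit-learn >= 1.0 for get_feature_names_out):
# because the KMeans centroids live in the vectorizer's term space, the largest
# centroid components give per-cluster keywords without re-reading the files:
# terms = vectorizer.get_feature_names_out()
# for i, center in enumerate(cluster.cluster_centers_):
#     top = center.argsort()[::-1][:10]
#     print('Cluster {}: {}'.format(i, ', '.join(terms[t] for t in top)))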
# Evaluate clustering with the silhouette coefficient
score = silhouette_score(X, cluster.labels_, metric='euclidean')
print('Silhouette score:', score)
# Evaluate with the Calinski-Harabasz index
score = calinski_harabasz_score(X.toarray(), cluster.labels_)
print('Calinski-Harabasz index:', score)
# Evaluate with the Davies-Bouldin index
score = davies_bouldin_score(X.toarray(), cluster.labels_)
print('Davies-Bouldin index:', score)
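# Interpretation: the silhouette coefficient (range [-1, 1]) and the
# Calinski-Harabasz index are better when higher; Davies-Bouldin is better when lower.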
